In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings("ignore")
import seaborn as sns
import plotly.express as px
In [3]:
#import datasets
File = pd.read_csv(r"C:\Users\AMIT KUMAR\Downloads\Unemployment in India (1).csv",header=0)
In [4]:
#View first 10 rows
File.head(10)
Out[4]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural
5 Andhra Pradesh 31-10-2019 Monthly 3.52 12017412.0 43.01 Rural
6 Andhra Pradesh 30-11-2019 Monthly 4.12 11397681.0 41.00 Rural
7 Andhra Pradesh 31-12-2019 Monthly 4.38 12528395.0 45.14 Rural
8 Andhra Pradesh 31-01-2020 Monthly 4.84 12016676.0 43.46 Rural
9 Andhra Pradesh 29-02-2020 Monthly 5.91 11723617.0 42.83 Rural
In [5]:
#Display the dataframes
File.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    740 non-null    object 
 1    Date                                     740 non-null    object 
 2    Frequency                                740 non-null    object 
 3    Estimated Unemployment Rate (%)          740 non-null    float64
 4    Estimated Employed                       740 non-null    float64
 5    Estimated Labour Participation Rate (%)  740 non-null    float64
 6   Area                                      740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB
In [6]:
#Check for missing or null values
File.isna().sum()
Out[6]:
Region                                      28
 Date                                       28
 Frequency                                  28
 Estimated Unemployment Rate (%)            28
 Estimated Employed                         28
 Estimated Labour Participation Rate (%)    28
Area                                        28
dtype: int64
In [7]:
#dropping missing values
File = File.dropna()
File.isna().sum()
Out[7]:
Region                                      0
 Date                                       0
 Frequency                                  0
 Estimated Unemployment Rate (%)            0
 Estimated Employed                         0
 Estimated Labour Participation Rate (%)    0
Area                                        0
dtype: int64
In [8]:
#checking columns
File.columns
Out[8]:
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')
In [9]:
#stripping the spaces  infront of the columns
File.columns = File.columns.str.strip()
In [10]:
File.columns
Out[10]:
Index(['Region', 'Date', 'Frequency', 'Estimated Unemployment Rate (%)',
       'Estimated Employed', 'Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')
In [11]:
#checking for duplicates
File.duplicated().sum()
Out[11]:
0
In [12]:
#checking unique values
File["Frequency"].unique()
Out[12]:
array([' Monthly', 'Monthly'], dtype=object)
In [13]:
# Strip the empty space to get 1 unique value
File["Frequency"] = File["Frequency"].str.strip()
File["Frequency"].unique()
Out[13]:
array(['Monthly'], dtype=object)
In [14]:
# Check unique values for the date column
File["Date"].unique()
Out[14]:
array([' 31-05-2019', ' 30-06-2019', ' 31-07-2019', ' 31-08-2019',
       ' 30-09-2019', ' 31-10-2019', ' 30-11-2019', ' 31-12-2019',
       ' 31-01-2020', ' 29-02-2020', ' 31-03-2020', ' 30-04-2020',
       ' 31-05-2020', ' 30-06-2020'], dtype=object)
In [15]:
# Strip the empty space to get 1 unique value
File["Date"] = File["Date"].str.strip()
File["Date"].unique()
Out[15]:
array(['31-05-2019', '30-06-2019', '31-07-2019', '31-08-2019',
       '30-09-2019', '31-10-2019', '30-11-2019', '31-12-2019',
       '31-01-2020', '29-02-2020', '31-03-2020', '30-04-2020',
       '31-05-2020', '30-06-2020'], dtype=object)
In [17]:
# Dropping the "Frequency" column
files = File.drop("Frequency", axis=1)
In [18]:
# Changing the date datatype from object to datetime
files['Date'] = pd.to_datetime(files["Date"], format="%d-%m-%Y")
In [19]:
# Create year-month column for aggregation
files['Year_Month'] = files['Date'].dt.strftime('%Y-%m')
In [20]:
# Create Estimated Employed Rate column
files['Estimated Employed Rate (%)'] = (files['Estimated Labour Participation Rate (%)'] - files['Estimated Unemployment Rate (%)'])
files
Out[20]:
Region Date Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area Year_Month Estimated Employed Rate (%)
0 Andhra Pradesh 2019-05-31 3.65 11999139.0 43.24 Rural 2019-05 39.59
1 Andhra Pradesh 2019-06-30 3.05 11755881.0 42.05 Rural 2019-06 39.00
2 Andhra Pradesh 2019-07-31 3.75 12086707.0 43.50 Rural 2019-07 39.75
3 Andhra Pradesh 2019-08-31 3.32 12285693.0 43.97 Rural 2019-08 40.65
4 Andhra Pradesh 2019-09-30 5.17 12256762.0 44.68 Rural 2019-09 39.51
... ... ... ... ... ... ... ... ...
749 West Bengal 2020-02-29 7.55 10871168.0 44.09 Urban 2020-02 36.54
750 West Bengal 2020-03-31 6.67 10806105.0 43.34 Urban 2020-03 36.67
751 West Bengal 2020-04-30 15.63 9299466.0 41.20 Urban 2020-04 25.57
752 West Bengal 2020-05-31 15.22 9240903.0 40.67 Urban 2020-05 25.45
753 West Bengal 2020-06-30 9.86 9088931.0 37.57 Urban 2020-06 27.71

740 rows × 8 columns

In [21]:
# since we have a Year-Month column, we dropped the date
files = files.drop("Date", axis=1)
files
Out[21]:
Region Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area Year_Month Estimated Employed Rate (%)
0 Andhra Pradesh 3.65 11999139.0 43.24 Rural 2019-05 39.59
1 Andhra Pradesh 3.05 11755881.0 42.05 Rural 2019-06 39.00
2 Andhra Pradesh 3.75 12086707.0 43.50 Rural 2019-07 39.75
3 Andhra Pradesh 3.32 12285693.0 43.97 Rural 2019-08 40.65
4 Andhra Pradesh 5.17 12256762.0 44.68 Rural 2019-09 39.51
... ... ... ... ... ... ... ...
749 West Bengal 7.55 10871168.0 44.09 Urban 2020-02 36.54
750 West Bengal 6.67 10806105.0 43.34 Urban 2020-03 36.67
751 West Bengal 15.63 9299466.0 41.20 Urban 2020-04 25.57
752 West Bengal 15.22 9240903.0 40.67 Urban 2020-05 25.45
753 West Bengal 9.86 9088931.0 37.57 Urban 2020-06 27.71

740 rows × 7 columns

In [23]:
import matplotlib.pyplot as plt
In [24]:
# Calculate average unemployment rate by year-month
unemployment_trend = files.groupby('Year_Month')['Estimated Unemployment Rate (%)'].mean().reset_index()

# Create the visualization
plt.figure(figsize=(12, 6))
plt.plot(unemployment_trend['Year_Month'], unemployment_trend['Estimated Unemployment Rate (%)'], marker='o')
plt.xticks(rotation=45)
plt.title('Average Unemployment Rate Trend in India')
plt.xlabel('Year_Month')
plt.ylabel('Unemployment Rate (%)')
plt.grid(True)
plt.tight_layout()
plt.show()


print("\
average unemployment rate:")
print(unemployment_trend)
No description has been provided for this image
average unemployment rate:
   Year_Month  Estimated Unemployment Rate (%)
0     2019-05                         8.874259
1     2019-06                         9.303333
2     2019-07                         9.033889
3     2019-08                         9.637925
4     2019-09                         9.051731
5     2019-10                         9.900909
6     2019-11                         9.868364
7     2019-12                         9.497358
8     2020-01                         9.950755
9     2020-02                         9.964717
10    2020-03                        10.700577
11    2020-04                        23.641569
12    2020-05                        24.875294
13    2020-06                        11.903600
In [25]:
plt.figure(figsize=(13, 9))

# Calculate average unemployment rate by Region
Region_umemployment_rate = files.groupby('Region')['Estimated Unemployment Rate (%)'].mean().reset_index()

# Creating a column chart
sns.barplot(x=Region_umemployment_rate['Region'], y=Region_umemployment_rate['Estimated Unemployment Rate (%)'])

# Adding title and labels
plt.title('Average Unemployment Rate by Region')
plt.xlabel('Region')
plt.ylabel('Estimated Unemployment Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Add horizontal gridlines

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45, ha='right')  # Rotate by 45 degrees and align to the right

# Adjusting layout and displaying the plot
plt.tight_layout()
plt.show()

print("\
Average Unemployment Rate by Region:")
print(Region_umemployment_rate)
No description has been provided for this image
Average Unemployment Rate by Region:
              Region  Estimated Unemployment Rate (%)
0     Andhra Pradesh                         7.477143
1              Assam                         6.428077
2              Bihar                        18.918214
3         Chandigarh                        15.991667
4       Chhattisgarh                         9.240357
5              Delhi                        16.495357
6                Goa                         9.274167
7            Gujarat                         6.663929
8            Haryana                        26.283214
9   Himachal Pradesh                        18.540357
10   Jammu & Kashmir                        16.188571
11         Jharkhand                        20.585000
12         Karnataka                         6.676071
13            Kerala                        10.123929
14    Madhya Pradesh                         7.406429
15       Maharashtra                         7.557500
16         Meghalaya                         4.798889
17            Odisha                         5.657857
18        Puducherry                        10.215000
19            Punjab                        12.031071
20         Rajasthan                        14.058214
21            Sikkim                         7.249412
22        Tamil Nadu                         9.284286
23         Telangana                         7.737857
24           Tripura                        28.350357
25     Uttar Pradesh                        12.551429
26       Uttarakhand                         6.582963
27       West Bengal                         8.124643
In [26]:
# Calculate Area Estimated Unemployment total
urban = files.groupby('Area')['Estimated Unemployment Rate (%)'].sum().reset_index()

fig = px.pie(urban,names="Area",values='Estimated Unemployment Rate (%)',
              color_discrete_sequence=["BLACK","GREEN"],title="Rural-Urban percentage Uemployment rate", hole=0.5)

# Set the size of the chart
fig.update_layout(width=1200,  height=600)
fig.show()

print("\
Area Estimated Unemployment total (%):")
print(urban)
Area Estimated Unemployment total (%):
    Area  Estimated Unemployment Rate (%)
0  Rural                          3706.60
1  Urban                          5016.48
In [27]:
plt.figure(figsize=(13, 9))

# Calculate average Estimated Labour Participation Rate (%) by region
Region_Labour_Participation_Rate = files.groupby('Region')['Estimated Labour Participation Rate (%)'].mean().reset_index()

# Creating a column chart
sns.barplot(x=Region_Labour_Participation_Rate['Region'], y=Region_Labour_Participation_Rate['Estimated Labour Participation Rate (%)'])

# Adding title and labels
plt.title(' Estimated Labour Participation Rate (%) by Region')
plt.xlabel('Region')
plt.ylabel('Estimated Labour Participation Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Add horizontal gridlines

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45, ha='right')  # Rotate by 45 degrees and align to the right

# Adjusting layout and displaying the plot
plt.tight_layout()
plt.show()

# Display cluster sizes and centers
print("\
Data Quality Check:")
print("Region_Labour_Participation_Rate:")
print(Region_Labour_Participation_Rate)
No description has been provided for this image
Data Quality Check:
Region_Labour_Participation_Rate:
              Region  Estimated Labour Participation Rate (%)
0     Andhra Pradesh                                39.375714
1              Assam                                44.868462
2              Bihar                                38.153929
3         Chandigarh                                39.336667
4       Chhattisgarh                                42.810714
5              Delhi                                38.929643
6                Goa                                39.249583
7            Gujarat                                46.101071
8            Haryana                                42.737143
9   Himachal Pradesh                                44.222143
10   Jammu & Kashmir                                41.030952
11         Jharkhand                                41.670714
12         Karnataka                                41.345357
13            Kerala                                34.867857
14    Madhya Pradesh                                38.821429
15       Maharashtra                                42.303214
16         Meghalaya                                57.080741
17            Odisha                                38.926429
18        Puducherry                                38.992692
19            Punjab                                41.138214
20         Rajasthan                                39.973214
21            Sikkim                                46.070000
22        Tamil Nadu                                40.872143
23         Telangana                                53.002500
24           Tripura                                61.823929
25     Uttar Pradesh                                39.432500
26       Uttarakhand                                33.775556
27       West Bengal                                45.417500
In [28]:
# Calculate Estimated_Employed By Region
plt.figure(figsize=(12, 6))
Estimated_Employed_trend = files.groupby('Region')['Estimated Employed'].mean().reset_index()
sns.barplot(x = Estimated_Employed_trend['Estimated Employed'], y = Estimated_Employed_trend['Region'])
plt.title('Estimated_Employed By Region')
plt.xlabel('Estimated Employed')

plt.tight_layout()
plt.show()

print("\
Estimated_Employed By Region:")
print(Estimated_Employed_trend)
No description has been provided for this image
Estimated_Employed By Region:
              Region  Estimated Employed
0     Andhra Pradesh        8.154093e+06
1              Assam        5.354772e+06
2              Bihar        1.236619e+07
3         Chandigarh        3.168312e+05
4       Chhattisgarh        4.303499e+06
5              Delhi        2.627513e+06
6                Goa        2.263083e+05
7            Gujarat        1.140201e+07
8            Haryana        3.557072e+06
9   Himachal Pradesh        1.059824e+06
10   Jammu & Kashmir        1.799932e+06
11         Jharkhand        4.469240e+06
12         Karnataka        1.066712e+07
13            Kerala        4.425900e+06
14    Madhya Pradesh        1.111548e+07
15       Maharashtra        1.999020e+07
16         Meghalaya        6.897368e+05
17            Odisha        6.545747e+06
18        Puducherry        2.122781e+05
19            Punjab        4.539362e+06
20         Rajasthan        1.004106e+07
21            Sikkim        1.068807e+05
22        Tamil Nadu        1.226955e+07
23         Telangana        7.939663e+06
24           Tripura        7.170026e+05
25     Uttar Pradesh        2.809483e+07
26       Uttarakhand        1.390228e+06
27       West Bengal        1.719854e+07
In [29]:
# 1. Sales Distribution by Product Category
plt.figure(figsize=(12, 6))
Labour_participation_by_area = files.groupby('Area')['Estimated Labour Participation Rate (%)'].mean().reset_index()
sns.barplot(x = Labour_participation_by_area['Estimated Labour Participation Rate (%)'], y = Labour_participation_by_area['Area'])
plt.title('Labour_participation_by_area')
plt.xlabel('Estimated Labour Participation Rate')

plt.tight_layout()
plt.show()

print("\
Labour_participation_by_area:")
print(Labour_participation_by_area)
No description has been provided for this image
Labour_participation_by_area:
    Area  Estimated Labour Participation Rate (%)
0  Rural                                44.464819
1  Urban                                40.901365
In [ ]:
 
In [ ]:
 
In [ ]: